library(phyloseq)
library(ggplot2)
library(ggsignif)
library(cowplot)
library(tidyverse)
library(reshape2)
library("plotly") # plot 3D
library("microbiome") # for centered log-ratio
library("coda") # Aitchison distance
library("coda.base") # Aitchison distance
library("vegan") # NMDS
library(pheatmap) # for heatmap
# Set path
path <- "~/Projects/IBS_Meta-analysis_16S"
path.plots <- file.path(path, "data/analysis-individual/PLOTS/plots-Mars-EDA")
# Import phyloseq object
physeq.mars <- readRDS(file.path(path, "phyloseq-objects/physeq_mars.rds"))
# Sanity check
physeq.mars
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 1557 taxa and 69 samples ]
## sample_data() Sample Data: [ 69 samples by 12 sample variables ]
## tax_table() Taxonomy Table: [ 1557 taxa by 7 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 1557 tips and 1555 internal nodes ]
## refseq() DNAStringSet: [ 1557 reference sequences ]
Phylogenetic tree was computed with the package phangorn, and the script was run on a cluster. Let’s check we have correctly generated a phylogenetic tree.
# Look at the tree
plot_tree(physeq.mars, color = "Phylum", ladderize="left")
This dataset has several covariates (gender, age, bmi). We will check whether there is the same distribution of these covariates between healthy and IBS patients.
# Number of individuals in each group (keeping just 1 sample per individual)
metadata <- data.frame(sample_data(physeq.mars)) %>%
select(host_disease, host_bmi, host_age, host_sex, host_ID) %>%
group_by(host_ID) %>%
summarise_all(first)
metadata %>%
count(host_disease)
# Age
metadata %>%
group_by(host_disease) %>%
summarize(mean_age=mean(host_age), sd_age=sd(host_age))
wilcox.test(metadata[metadata$host_disease == "IBS", ]$host_age,
metadata[metadata$host_disease == "Healthy", ]$host_age,
correct=FALSE) # p=0.84
##
## Wilcoxon rank sum test
##
## data: metadata[metadata$host_disease == "IBS", ]$host_age and metadata[metadata$host_disease == "Healthy", ]$host_age
## W = 155.5, p-value = 0.8429
## alternative hypothesis: true location shift is not equal to 0
# Gender
metadata %>%
count(host_disease, host_sex)
# Can't use chi^2 test because one of the values < 5
fisher.test(data.frame("Female" = c(8,21),
"Male" = c(4,6),
row.names = c("Healthy", "IBS"))) # p=0.7
##
## Fisher's Exact Test for Count Data
##
## data: data.frame(Female = c(8, 21), Male = c(4, 6), row.names = c("Healthy", "IBS"))
## p-value = 0.6927
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.1020796 3.5642776
## sample estimates:
## odds ratio
## 0.5801387
# BMI
metadata %>%
group_by(host_disease) %>%
summarize(mean_bmi=mean(na.omit(host_bmi)), sd_bmi=sd(na.omit(host_bmi)))
wilcox.test(metadata[metadata$host_disease == "IBS",]$host_bmi,
metadata[metadata$host_disease == "Healthy", ]$host_bmi,
correct=FALSE) # 0.98
##
## Wilcoxon rank sum test
##
## data: metadata[metadata$host_disease == "IBS", ]$host_bmi and metadata[metadata$host_disease == "Healthy", ]$host_bmi
## W = 163, p-value = 0.9757
## alternative hypothesis: true location shift is not equal to 0
# Plot Phylum
plot_bar(physeq.mars, fill = "Phylum") + facet_wrap("host_disease", scales="free_x") +
theme_cowplot()+
theme(axis.text.x = element_text(size = 6, angle=90))+
labs(x = "Samples", y = "Absolute abundance", title = "Mars dataset (2020)")
# ggsave(file.path(path.plots, "absAbundance_phylum.jpg"), width=13, height=5)
# Plot Class
plot_bar(physeq.mars, fill = "Class")+ facet_wrap("host_disease", scales="free_x") +
theme_cowplot()+
theme(axis.text.x = element_text(size = 6, angle=90))+
labs(x = "Samples", y = "Absolute abundance", title = "Mars dataset (2020)")
Sequencing depth characteristics of the Mars dataset:
- minimum of 1140 total count per sample
- median: 3.255610^{4} total count per sample
- maximum of 6.360210^{4} total count per sample
# Agglomerate to phylum & class levels
phylum.table <- physeq.mars %>%
tax_glom(taxrank = "Phylum") %>% # agglomerate at phylum level
transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
psmelt() # Melt to long format
class.table <- physeq.mars %>%
tax_glom(taxrank = "Class") %>%
transform_sample_counts(function(x) {x/sum(x)} ) %>%
psmelt()
sample_data(physeq.mars)$host_ID <- as.character(sample_data(physeq.mars)$host_ID)
# Plot relative abundances
ggplot(phylum.table, aes(x = reorder(Sample, Sample, function(x) mean(phylum.table[Sample == x & Phylum == 'Firmicutes', 'Abundance'])),
y = Abundance, fill = Phylum))+
facet_wrap(~ host_disease, scales = "free") + # scales = "free" removes empty lines
geom_bar(stat = "identity") +
theme_cowplot()+
theme(axis.text.x = element_blank())+
labs(x = "Samples", y = "Relative abundance", title = "Mars dataset (2020)")
# ggsave(file.path(path.plots, "relAbundance_phylum.jpg"), width=10, height=5)
ggplot(class.table, aes(x = reorder(Sample, Sample, function(x) mean(phylum.table[Sample == x & Phylum == 'Firmicutes', 'Abundance'])),
y = Abundance, fill = Class))+
facet_wrap(~ host_disease, scales = "free") + # scales = "free" removes empty lines
geom_bar(stat = "identity") +
theme_cowplot()+
theme(axis.text.x = element_blank(),
legend.key.size = unit(0.2, 'cm'),
legend.text = element_text(size=8))+
labs(x = "Samples", y = "Relative abundance", title = "Mars dataset (2020)")
# ggsave(file.path(path.plots, "relAbundance_class.jpg"), width=12, height=5)
# As curiosity, group samples per host_ID (is there a big difference between collection times?)
ggplot(phylum.table, aes(x = Sample, y = Abundance, fill = Phylum))+
facet_wrap(~ host_ID, scales = "free_x", nrow=2) + # scales = "free" removes empty lines
geom_bar(stat = "identity", width=1) +
theme_cowplot()+
theme(axis.text.x = element_blank(),
panel.spacing = unit(0.1, "lines"),
strip.text = element_text(size=8),
legend.key.size = unit(0.2, 'cm'),
legend.text = element_text(size=8))+
labs(x = "Samples", y = "Relative abundance", title = "Mars dataset (2020)")
# Extract abundance of only Bacteroidota and Firmicutes
relevant.covariates <- c('Sample', 'Abundance', 'host_disease', 'Phylum', 'host_ID', 'host_subtype', 'Collection', 'host_sex')
bacter <- phylum.table %>%
filter(Phylum == "Bacteroidota") %>%
select(all_of(relevant.covariates)) %>%
dplyr::rename(Bacteroidota = Abundance) %>%
select(-Phylum)
firmi <- phylum.table %>%
filter(Phylum == "Firmicutes") %>%
select(all_of(relevant.covariates)) %>%
dplyr::rename(Firmicutes = Abundance) %>%
select(-Phylum)
# Calculate log2 ratio Firmicutes/Bacteroidota
ratio.FB <- left_join(x=bacter, y=firmi, by=c('Sample', 'host_disease', 'host_ID', 'host_subtype', 'Collection', 'host_sex')) %>%
relocate(Firmicutes, .after=Bacteroidota) %>%
# Compute log ratios
mutate(logRatioFB = log2(Firmicutes/Bacteroidota))
# Plot log2 ratio Firmicutes/Bacteroidota
ggplot(ratio.FB, aes(x = host_disease, y = logRatioFB))+
geom_violin(aes(fill=host_disease))+
scale_fill_manual(values=scales::alpha(c("blue", "red"), .3))+
geom_jitter(width=0.1)+
geom_signif(comparisons = list(c("Healthy", "IBS")), map_signif_level = TRUE, test="wilcox.test") +
labs(x = "", y = 'Log2(Firmicutes/Bacteroidota)')+
theme_cowplot()+
theme(legend.position="none")
# ggsave(file.path(path.plots, "ratioFB.jpg"), width=4, height=6)
# Statistical test
wilcox.test(ratio.FB[ratio.FB$host_disease == "IBS","logRatioFB"],
ratio.FB[ratio.FB$host_disease == "Healthy","logRatioFB"]) # p = 0.6
##
## Wilcoxon rank sum exact test
##
## data: ratio.FB[ratio.FB$host_disease == "IBS", "logRatioFB"] and ratio.FB[ratio.FB$host_disease == "Healthy", "logRatioFB"]
## W = 579, p-value = 0.63
## alternative hypothesis: true location shift is not equal to 0
# What about per Collection time point?
ggplot(ratio.FB, aes(x = host_disease, y = logRatioFB))+
geom_violin(aes(fill=host_disease))+
scale_fill_manual(values=scales::alpha(c("blue", "red"), .3))+
geom_jitter(width=0.1)+
facet_wrap(~Collection) +
geom_signif(comparisons = list(c("Healthy", "IBS")), map_signif_level = TRUE, test="wilcox.test") +
labs(x = "", y = 'Log2(Firmicutes/Bacteroidota)')+
theme_cowplot()+
theme(legend.position="none")
ggplot(ratio.FB, aes(x = Collection, y = logRatioFB))+
geom_violin(aes(fill=host_disease))+
scale_fill_manual(values=scales::alpha(c("blue", "red"), .3))+
geom_point(size = 2)+
facet_wrap(~host_disease) +
geom_line(aes(group=host_ID), lwd=0.2) +
labs(x = "", y = 'Log2(Firmicutes/Bacteroidota)')+
theme_cowplot()+
theme(legend.position="none")
# ggsave(file.path(path.plots, "ratioFB_paired-data.jpg"), width=4, height=6)
# Plot by IBS subtype
ggplot(ratio.FB, aes(x = host_subtype, y = logRatioFB))+
facet_wrap(~Collection) +
geom_violin(aes(fill=host_subtype))+
scale_fill_manual(values=scales::alpha(c("blue", "brown", "red"), .3))+
geom_jitter(width=0.1)+
geom_signif(comparisons = list(c("HC", "IBS-C"), c("HC", "IBS-D")),
map_signif_level = TRUE, test="wilcox.test", step_increase=0.1) +
labs(x = "", y = 'Log2(Firmicutes/Bacteroidota)')+
theme_cowplot()+
theme(legend.position="none")
# ggsave(file.path(path.plots, "ratioFB_subtype.jpg"), width=5, height=5)
# Statistical test HC vs IBS-C
wilcox.test(ratio.FB[ratio.FB$Collection == "1st" & ratio.FB$host_subtype == "HC","logRatioFB"],
ratio.FB[ratio.FB$Collection == "1st" & ratio.FB$host_subtype == "IBS-C","logRatioFB"]) # p = 0.04
##
## Wilcoxon rank sum exact test
##
## data: ratio.FB[ratio.FB$Collection == "1st" & ratio.FB$host_subtype == "HC", "logRatioFB"] and ratio.FB[ratio.FB$Collection == "1st" & ratio.FB$host_subtype == "IBS-C", "logRatioFB"]
## W = 44, p-value = 0.04069
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(ratio.FB[ratio.FB$Collection == "2nd" & ratio.FB$host_subtype == "HC","logRatioFB"],
ratio.FB[ratio.FB$Collection == "2nd" & ratio.FB$host_subtype == "IBS-C","logRatioFB"]) # p = 0.9
##
## Wilcoxon rank sum exact test
##
## data: ratio.FB[ratio.FB$Collection == "2nd" & ratio.FB$host_subtype == "HC", "logRatioFB"] and ratio.FB[ratio.FB$Collection == "2nd" & ratio.FB$host_subtype == "IBS-C", "logRatioFB"]
## W = 56, p-value = 0.917
## alternative hypothesis: true location shift is not equal to 0
# Statistical test HC vs IBS-D
wilcox.test(ratio.FB[ratio.FB$Collection == "1st" & ratio.FB$host_subtype == "HC","logRatioFB"],
ratio.FB[ratio.FB$Collection == "1st" & ratio.FB$host_subtype == "IBS-D","logRatioFB"]) # p = 0.7
##
## Wilcoxon rank sum exact test
##
## data: ratio.FB[ratio.FB$Collection == "1st" & ratio.FB$host_subtype == "HC", "logRatioFB"] and ratio.FB[ratio.FB$Collection == "1st" & ratio.FB$host_subtype == "IBS-D", "logRatioFB"]
## W = 60, p-value = 0.7399
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(ratio.FB[ratio.FB$Collection == "2nd" & ratio.FB$host_subtype == "HC","logRatioFB"],
ratio.FB[ratio.FB$Collection == "2nd" & ratio.FB$host_subtype == "IBS-D","logRatioFB"]) # p = 0.29
##
## Wilcoxon rank sum exact test
##
## data: ratio.FB[ratio.FB$Collection == "2nd" & ratio.FB$host_subtype == "HC", "logRatioFB"] and ratio.FB[ratio.FB$Collection == "2nd" & ratio.FB$host_subtype == "IBS-D", "logRatioFB"]
## W = 84, p-value = 0.2875
## alternative hypothesis: true location shift is not equal to 0
# Just for curiosty, look by host_sex
ggplot(ratio.FB, aes(x = host_disease, y = logRatioFB))+
geom_violin(aes(fill=host_disease))+
scale_fill_manual(values=scales::alpha(c("blue", "red"), .3))+
geom_jitter(width=0.1)+
facet_wrap(~host_sex) +
labs(x = "", y = 'Log2(Firmicutes/Bacteroidota)')+
theme_cowplot()+
theme(legend.position="none")
# Sanity check no sample with less than 500 total count
table(sample_sums(physeq.mars)<500) # all FALSE
#____________________________________________________________________
# PHYLOSEQ OBJECT WITH NON-ZERO COMPOSITIONS
physeq.NZcomp <- physeq.mars
otu_table(physeq.NZcomp)[otu_table(physeq.NZcomp) == 0] <- 0.5 # pseudocounts
# Sanity check that 0 values have been replaced
# otu_table(physeq.mars)[1:5,1:5]
# otu_table(physeq.NZcomp)[1:5,1:5]
# transform into compositions
physeq.NZcomp <- transform_sample_counts(physeq.NZcomp, function(x) x / sum(x) )
table(rowSums(otu_table(physeq.NZcomp))) # check if there is any row not summing to 1
# Save object
saveRDS(physeq.NZcomp, file.path(path, "data/analysis-individual/Mars-2020/02_EDA-Mars/physeq_NZcomp.rds"))
#____________________________________________________________________
# PHYLOSEQ OBJECT WITH RELATIVE COUNT (BETWEEN 0 AND 1)
physeq.rel <- physeq.mars
physeq.rel <- transform_sample_counts(physeq.rel, function(x) x / sum(x) ) # divide each count by the total number of counts (per sample)
# check the counts are all relative
# otu_table(physeq.mars)[1:5, 1:5]
# otu_table(physeq.rel)[1:5, 1:5]
# sanity check
table(rowSums(otu_table(physeq.rel))) # check if there is any row not summing to 1
# save the physeq.rel object
saveRDS(physeq.rel, file.path(path, "data/analysis-individual/Mars-2020/02_EDA-Mars/physeq_relative.rds"))
#____________________________________________________________________
# PHYLOSEQ OBJECT WITH COMMON-SCALE NORMALIZATION
physeq.CSN <- physeq.mars
physeq.CSN <- transform_sample_counts(physeq.CSN, function(x) (x*min(sample_sums(physeq.CSN))) / sum(x) )
# sanity check
table(rowSums(otu_table(physeq.CSN))) # check that all rows are summing to the same total
# save the physeq.CSN object
saveRDS(physeq.CSN, file.path(path, "data/analysis-individual/Mars-2020/02_EDA-Mars/physeq_CSN.rds"))
#____________________________________________________________________
# PHYLOSEQ OBJECT WITH CENTERED LOG RATIO COUNT
physeq.clr <- physeq.mars
physeq.clr <- microbiome::transform(physeq.mars, "clr") # the function adds pseudocounts itself
# Compare the otu tables in the original phyloseq object and the new one after CLR transformation
# otu_table(physeq.mars)[1:5, 1:5] # should contain absolute counts
# otu_table(physeq.clr)[1:5, 1:5] # should all be relative
# save the physeq.rel object
saveRDS(physeq.clr, file.path(path, "data/analysis-individual/Mars-2020/02_EDA-Mars/physeq_clr.rds"))
First, let’s look at these four distances of interest.
#____________________________________________________________________________________
# Measure distances
getDistances <- function(relPhyseq=physeq.rel, clrPhyseq=physeq.clr, csnPhyseq=physeq.CSN, nzcompPhyseq=physeq.NZcomp){
# sanity check
cat("nb samples relPhyseq:", nsamples(relPhyseq), "\n")
cat("nb samples clrPhyseq:", nsamples(clrPhyseq), "\n")
cat("nb samples csnPhyseq:", nsamples(csnPhyseq), "\n")
cat("nb samples nzcompPhyseq:", nsamples(nzcompPhyseq), "\n")
# Compute distances
print("Unifrac...")
set.seed(123) # for unifrac, need to set a seed
glom.UniF <- UniFrac(relPhyseq, weighted=TRUE, normalized=TRUE) # weighted unifrac
print("Aitchison...")
glom.ait <- phyloseq::distance(clrPhyseq, method = 'euclidean') # aitchison
print("Bray des bois...")
glom.bray <- phyloseq::distance(csnPhyseq, method = "bray") # bray-curtis
print("Canberra <3...")
glom.can <- phyloseq::distance(nzcompPhyseq, method = "canberra") # canberra
dist.list <- list("UniF" = glom.UniF, "Ait" = glom.ait, "Canb" = glom.can, "Bray" = glom.bray)
return(dist.list)
}
#____________________________________________________________________________________
# Plot in 2D the distances
plotDistances2D <- function(dlist, ordination="MDS", relPhyseq=physeq.rel, clrPhyseq=physeq.clr, csnPhyseq=physeq.CSN, nzcompPhyseq=physeq.NZcomp){
plist <- NULL
plist <- vector("list", 4)
names(plist) <- c("Weighted Unifrac", "Aitchison", "Bray-Curtis", "Canberra")
print("Unifrac")
# Weighted UniFrac
set.seed(123)
iMDS.UniF <- ordinate(relPhyseq, ordination, distance=dlist$UniF)
plist[[1]] <- plot_ordination(relPhyseq, iMDS.UniF, color="host_disease")
print("Aitchison")
# Aitchison
set.seed(123)
iMDS.Ait <- ordinate(clrPhyseq, ordination, distance=dlist$Ait)
plist[[2]] <- plot_ordination(clrPhyseq, iMDS.Ait, color="host_disease")
print("Bray")
# Bray-Curtis
set.seed(123)
iMDS.Bray <- ordinate(csnPhyseq, ordination, distance=dlist$Bray)
plist[[3]] <- plot_ordination(csnPhyseq, iMDS.Bray, color="host_disease")
print("Canberra")
# Canberra
set.seed(123)
iMDS.Can <- ordinate(nzcompPhyseq, ordination, distance=dlist$Can)
plist[[4]] <- plot_ordination(nzcompPhyseq, iMDS.Can, color="host_disease")
# Creating a dataframe to plot everything
plot.df = plyr::ldply(plist, function(x) x$data)
names(plot.df)[1] <- "distance"
return(plot.df)
}
Now let’s plot!
#___________________________
# 1ST COLLECTION TIME POINT
#___________________________
# Get the distances & the plot data
dist.mars.1st <- getDistances(relPhyseq = subset_samples(physeq.rel, Collection=="1st"),
clrPhyseq = subset_samples(physeq.clr, Collection=="1st"),
csnPhyseq = subset_samples(physeq.CSN, Collection=="1st"),
nzcompPhyseq = subset_samples(physeq.NZcomp, Collection=="1st"))
## nb samples relPhyseq: 37
## nb samples clrPhyseq: 37
## nb samples csnPhyseq: 37
## nb samples nzcompPhyseq: 37
## [1] "Unifrac..."
## [1] "Aitchison..."
## [1] "Bray des bois..."
## [1] "Canberra <3..."
plot.df.1st <- plotDistances2D(dlist=dist.mars.1st,
relPhyseq = subset_samples(physeq.rel, Collection=="1st"),
clrPhyseq = subset_samples(physeq.clr, Collection=="1st"),
csnPhyseq = subset_samples(physeq.CSN, Collection=="1st"),
nzcompPhyseq = subset_samples(physeq.NZcomp, Collection=="1st"))
## [1] "Unifrac"
## [1] "Aitchison"
## [1] "Bray"
## [1] "Canberra"
# Plot
ggplot(plot.df.1st, aes(Axis.1, Axis.2, color=host_disease))+
geom_point(size=4, alpha=0.5) + scale_color_manual(values = c('blue', 'red'))+
facet_wrap(distance~., scales='free', nrow=1)+
theme_bw()+
theme(strip.text.x = element_text(size=20))+
labs(color="Disease", title="1st collection time point")
# ggsave(file.path(path.plots, "distances4_MDS_1stcollection.jpg"), height = 4, width = 15)
#________________
# ALL DATA
#________________
# Get the distances & the plot data
dist.mars <- getDistances()
## nb samples relPhyseq: 69
## nb samples clrPhyseq: 69
## nb samples csnPhyseq: 69
## nb samples nzcompPhyseq: 69
## [1] "Unifrac..."
## [1] "Aitchison..."
## [1] "Bray des bois..."
## [1] "Canberra <3..."
plot.df <- plotDistances2D(dlist=dist.mars)
## [1] "Unifrac"
## [1] "Aitchison"
## [1] "Bray"
## [1] "Canberra"
# Plot
p1 <- ggplot(plot.df, aes(Axis.1, Axis.2, color=host_disease))+
geom_point(size=4, alpha=0.5) + scale_color_manual(values = c('blue', 'red'))+
facet_wrap(distance~., scales='free', nrow=1)+
theme_bw()+
theme(strip.text.x = element_text(size=20),
axis.title.x = element_blank())+
labs(color="Disease")
p2 <- ggplot(plot.df, aes(Axis.1, Axis.2, color=host_subtype))+
geom_point(size=4, alpha=0.5) + scale_color_manual(values = c('blue', 'brown', 'red'))+
facet_wrap(distance~., scales='free', nrow=1)+
theme_bw()+
theme(strip.text.x = element_text(size=20),
axis.title.x = element_blank())+
labs(color="Disease")
p3 <- ggplot(plot.df, aes(Axis.1, Axis.2, color=Collection))+
geom_line(aes(group=host_ID), color="black", lwd=0.1)+
geom_point(size=4, alpha=0.7) +
scale_color_manual(values = c('#D95F02', '#7570B3'))+
facet_wrap(distance~., scales='free', nrow=1)+
theme_bw()+
theme(strip.text.x = element_blank())+
labs(color="Collection ")
ggpubr::ggarrange(p1,p2,p3, nrow=3)
# ggsave(file.path(path.plots, "distances4_MDS_all.jpg"), height = 10, width = 15)
For better visualization, we will also take a glance at reduction to 3D.
#____________________________________________________________________________________
# Plot 3D ordination
plotDistances3D <- function(d, name_dist){
# Reset parameters
mds.3D <- NULL
xyz <- NULL
fig.3D <- NULL
# Reduce distance matrix to 3 dimensions
set.seed(123)
mds.3D <- metaMDS(d, method="MDS", k=3, trace = 0)
xyz <- scores(mds.3D, display="sites") # pull out the (x,y,z) coordinates
# Plot
fig.3D <- plot_ly(x=xyz[,1], y=xyz[,2], z=xyz[,3], type="scatter3d", mode="markers",
color=sample_data(physeq.mars)$host_disease, colors = c("blue", "red"))%>%
layout(title = paste('MDS in 3D with', name_dist, 'distance', sep = ' '))
return(fig.3D)
}
Now let’s plot!
plotDistances3D(dist.mars$UniF, "UniFrac")
## Warning in metaMDS(d, method = "MDS", k = 3, trace = 0): stress is (nearly)
## zero: you may have insufficient data
plotDistances3D(dist.mars$Ait, "Aitchison")
plotDistances3D(dist.mars$Canb, "Canberra")
plotDistances3D(dist.mars$Bray, "Bray-Curtis")
# For heatmaps: have group color
matcol <- data.frame(phenotype = sample_data(physeq.mars)[,"host_disease"],
host_subtype = sample_data(physeq.mars)[,"host_subtype"],
collection = sample_data(physeq.mars)[,"Collection"])
# Function to get heatmap from the distances computed
plotHeatmaps <- function(dlist, fontsize){
# Initialize variables
i=1
plist <- vector("list", 4)
names(plist) <- names(dlist)
# Loop through distances
for(d in dlist){
plist[[i]] <- pheatmap(as.matrix(d),
clustering_distance_rows = d,
clustering_distance_cols = d,
fontsize = fontsize,
show_rownames = F,
show_colnames = F,
annotation_col = matcol,
# annotation_row = matcol,
annotation_colors = list(host_disease = c('Healthy' = 'blue', 'IBS' = 'red'),
host_subtype = c('HC'='black', 'IBS-M'='orange', 'IBS-C'='brown', 'IBS-D'='red'),
Collection = c("1st"='#D95F02', '2nd'='#7570B3')),
cluster_rows = T,
cluster_cols = T,
clustering_method = 'complete', # hc method
main = names(dlist)[i]) # have name of distance as title
i <- i+1
}
return(plist)
}
# Get the heatmaps
heatmp.mars <- plotHeatmaps(dlist = dist.mars, fontsize = 8)
Figure 1 - Comparison of the Bray-Curtis beta-diversity dispersion
# get useful metadata
metadata <- sample_data(physeq.mars) %>%
as_tibble() %>%
select(Run, host_disease, host_subtype, host_ID)
# Build dataframe with Bray-Curtis distance between IBS-HC, IBS-IBS, HC-HC
bc.comp <- melt(as.matrix(dist.mars$Bray), varnames = c("row", "col")) %>%
inner_join(metadata, by=c("row"="Run")) %>%
dplyr::rename(row_disease=host_disease, row_subtype=host_subtype, row_hostID=host_ID) %>%
inner_join(metadata, by=c("col"="Run")) %>%
dplyr::rename(col_disease=host_disease, col_subtype=host_subtype, col_hostID=host_ID) %>%
# Keep only comparison within subject (row_hostID == col_hostID), removing comparison with same sample (BC=0)
filter(row_hostID == col_hostID) %>%
filter(value != 0) %>%
# BC(SampleA,SampleB) == BC(SampleB,SampleA), so remove duplicates
group_by(row_hostID) %>%
summarise_all(first) %>%
ungroup() %>%
select(-row_hostID, -col_hostID) %>%
# Classify comparison as between disease subtypes
mutate(compare_subtype=ifelse(row_subtype == "HC" & col_subtype=="HC", "Within healthy",
ifelse(row_subtype == "IBS-C" & col_subtype=="IBS-C", "Within IBS-C",
ifelse(row_subtype == "IBS-D" & col_subtype=="IBS-D", "Within IBS-D", "Unknown")))) %>%
mutate(compare_subtype=factor(compare_subtype, levels=c("Within IBS-C", "Within IBS-D", "Within healthy")))
# Sanity check
bc.comp %>%
dplyr::count(row_subtype, col_subtype, compare_subtype)
# Plot fig1J
ggplot(bc.comp, aes(x=compare_subtype, y=value))+
geom_boxplot(width=0.2, outlier.shape = NA)+
geom_jitter(aes(color=compare_subtype), size=3, width=0.01, alpha=0.7)+
scale_color_manual(values=c("#ff7f00", "#377eb8", "#999999"))+
theme_cowplot()+
theme(axis.text.x = element_text(angle=45, hjust=1),
legend.position="none")+
labs(x='', y='Within subject BC dissimilarity')
(J) Community variability within each group based on mean Bray Curtis Distance (HC versus IBS-C, p value = 0.02, ANOVA Tukey HSD, n = 10, 11, and 9 mucosal microbiome samples for IBS-C, IBS-D, and HC, respectively).
As a proxy of the fig1G, let’s compute the between-subject BC distance.
#___________________________________________________________________
# Build dataframe with Bray-Curtis distance between IBS-HC, IBS-IBS, HC-HC
bc.comp <- melt(as.matrix(dist.mars$Bray), varnames = c("row", "col")) %>%
inner_join(metadata, by=c("row"="Run")) %>%
dplyr::rename(row_disease=host_disease, row_subtype=host_subtype, row_hostID=host_ID) %>%
inner_join(metadata, by=c("col"="Run")) %>%
dplyr::rename(col_disease=host_disease, col_subtype=host_subtype, col_hostID=host_ID) %>%
# Keep only comparison between different subjects (row_hostID != col_hostID)
filter(row_hostID != col_hostID) %>%
# Remove duplicate values (comparison(A,B) = comparison(B,A), so remove one of them)
rowwise() %>%
mutate(comp = paste(sort(c(row, col)), collapse="-")) %>%
ungroup() %>%
group_by(comp) %>%
summarise_all(first) %>%
ungroup() %>%
select(-comp) %>%
# Classify comparison as between diseases, between Healthy, or between IBS samples
mutate(compare_disease=ifelse(row_disease != col_disease, "Between",
ifelse(row_disease == "Healthy", "Healthy",
ifelse(row_disease == "IBS", "IBS", "Unknown")))) %>%
mutate(compare_disease=factor(compare_disease, levels=c("Between", "IBS", "Healthy"))) %>%
# Classify comparison as between disease subtypes
mutate(compare_subtype=ifelse(row_subtype == "HC" & col_subtype=="IBS-C", "Healthy_IBS-C",
ifelse(row_subtype == "IBS-C" & col_subtype=="HC", "Healthy_IBS-C",
ifelse(row_subtype == "HC" & col_subtype=="IBS-D", "Healthy_IBS-D",
ifelse(row_subtype == "IBS-D" & col_subtype=="HC", "Healthy_IBS-D",
ifelse(row_subtype == "IBS-D" & col_subtype=="IBS-C", "Between IBS subtypes",
ifelse(row_subtype == "IBS-C" & col_subtype=="IBS-D", "Between IBS subtypes",
ifelse(row_subtype == "HC" & col_subtype=="HC", "Within healthy",
ifelse(row_subtype == "IBS-C" & col_subtype=="IBS-C", "Within IBS-C",
ifelse(row_subtype == "IBS-D" & col_subtype=="IBS-D", "Within IBS-D", "Unknown")))))))))) %>%
mutate(compare_subtype=factor(compare_subtype, levels=c("Within IBS-C", "Healthy_IBS-C", "Within healthy", "Healthy_IBS-D", "Within IBS-D", "Between IBS subtypes")))
# Sanity check
bc.comp %>%
dplyr::count(row_disease, col_disease, compare_disease)
bc.comp %>%
dplyr::count(row_subtype, col_subtype, compare_subtype)
# Attempt at reproducing fig1G
ggplot(bc.comp, aes(x=compare_subtype, y=value))+
geom_boxplot(width=0.2, outlier.shape = NA)+
geom_jitter(aes(color=compare_subtype), size=2, width=0.01, alpha=0.5)+
scale_color_manual(values=c("#ff7f00", "#a1d76a", "#999999", "#9ecae1", "#2171b5", "#807dba"))+
theme_cowplot()+
theme(axis.text.x = element_text(angle=45, hjust=1),
legend.position="none")+
labs(x='', y='Bray-Curtis dissimilarity')
(G) Community variability determined by the mean within-subject Bray Curtis distance (within-IBS-D versus within-IBS-C, p = < 0.005, ANOVA Tukey, n = 22, 46, 24, 53, and 29, Bray-Curtis distances between stool samples of the same subject for within-IBS-C, Healthy versus IBS-C, within-Healthy, Healthy versus IBS-D, and within-IBS-D, respectively).
Just as personal interest, let’s look at the disease level (IBS-HC-IBSvsHC):
# Plot within IBS, within Healthy, and between IBS-Healthy BC dissimilarity
ggplot(bc.comp, aes(x=compare_disease, y=value))+
geom_violin()+
geom_boxplot(width=0.1, outlier.shape = NA)+
geom_jitter(size=0.1, width=0.01)+
theme_cowplot()+
labs(x='', y='Bray-Curtis dissimilarity')